library(DESeq2)
library(data.table)
library(gdata)
library(rtracklayer)
library(BSgenome)
library(VennDiagram)
library(ggplot2)
library(biomaRt)
library(RColorBrewer)
library(readr)
library(ensembldb)
library(EnsDb.Mmusculus.v79)
library(org.Mm.eg.db)
library(AnnotationDbi)
library(tidyverse)
library(plotly)
Set up the function
plotCDF.ggplot <- function(gene.counts, gene.sets, gene.set.labels,
col = "", linetype = "", xlim = c( -1.0, 1.3 ),
legend.size = 22, axistitle.size = 22, title = "Fold change log2 (Dicer KO/WT)",
legend.pos = c(0.7, 0.18)) {
require(ggplot2)
df.log2fc <- gene.counts[,c("gene", "log2FC")]
#rownames(df.log2fc) <- df.log2fc$gene
if (length(gene.sets) != length(gene.set.labels)){
return("Length of gene sets doesn't match labels")
}
target.expr <- df.log2fc[df.log2fc$gene %in% gene.sets[[1]],]
for (i in 2:length(gene.sets)){
target.expr <- rbind(target.expr, df.log2fc[df.log2fc$gene %in% gene.sets[[i]],])
}
gene.set.counts <- c()
for (j in 1:length(gene.sets)){
gene.set.counts <- c(gene.set.counts, sum(df.log2fc$gene %in% gene.sets[[j]]))
}
target.expr$Category <- rep(gene.set.labels, gene.set.counts)
target.expr$Category <- factor(target.expr$Category, levels = gene.set.labels)
log2FC.values <- lapply(gene.sets, function(gene.set) {
gene.counts[gene.counts$gene %in% gene.set,]$log2FC
})
ks.pvals <- lapply(log2FC.values,
function(log2FCs) {
ks.test(log2FCs, log2FC.values[[1]])$p.value
})
p <- ggplot( target.expr, aes( x = log2FC, colour = Category ) ) +
stat_ecdf( geom = 'step', aes( colour = Category, linetype = Category ), lwd = 1 ) +
scale_color_manual( values = col, labels = sprintf( "%s (%d)", gene.set.labels, gene.set.counts ) ) +
scale_linetype_manual( values = linetype, labels = sprintf( "%s (%d)", gene.set.labels, gene.set.counts ) ) +
# xlim() will remove data points; Be careful in the future
coord_cartesian( xlim = xlim ) + xlab(title) + ylab('CDF') +
theme_classic() + theme( legend.background = element_rect(fill = NA),
legend.title = element_blank(),
legend.position = legend.pos,
legend.text = element_text(size=legend.size),
legend.key.size = unit(1.5, 'lines'),
axis.title.x = element_text(size=axistitle.size, margin = margin(t = 10)),
axis.title.y = element_text(size=axistitle.size, margin = margin(r = 10)),
axis.text=element_text(size=20),
axis.line = element_line(size = 1), #axis label size
axis.ticks.length = unit(0.3, "cm")) #increase tick length
for (k in 2:length(gene.sets)){
p <- p + annotate(geom = "text", x = -0.9, y = 1-0.08*(k-1), hjust = 0,
label = sprintf("p = %.0e", ks.pvals[k]),
colour = col[k], size = 8)
}
print(p)
}
Load the enema model RNA-Seq DGE dataset.
rna_DGE <- read_csv("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/RNA-Seq/Mouse colon epithelium/Analysis/Differential Analysis.csv")
## Warning: Missing column names filled in: 'X1' [1]
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## X1 = col_character(),
## baseMean = col_double(),
## log2FoldChange = col_double(),
## lfcSE = col_double(),
## stat = col_double(),
## pvalue = col_double(),
## padj = col_double()
## )
Now load the list of peaks associated with each miRNA.
peaks.mir <- readRDS("Datafiles/merged-peaks-mirs-200-12232019.rds")
# First we need to nnotate each peak with its potential target gene, 3'UTR annotation gets priority.
peaks.mir$'target_gene' <- NA
for (i in 1:length(peaks.mir)) {
if (!is.na(peaks.mir$utr3[i]) | !is.na(peaks.mir$`utr3*`[i])) {
gene_name <- unique(c(peaks.mir$utr3[i],peaks.mir$`utr3*`[i]))
gene_name <- gene_name[!is.na(gene_name)]
peaks.mir$'target_gene'[i] <- paste(unlist(gene_name), collapse = " ")
}
else {
gene_name <- unique(c(peaks.mir$exon[i], peaks.mir$intron[i],peaks.mir$utr5[i],peaks.mir$`utr5*`[i]))
gene_name <- gene_name[!is.na(gene_name)]
if (length(gene_name) >0) {
peaks.mir$'target_gene'[i] <- paste(unlist(gene_name), collapse = " ")
}
}
}
Now I need to annotate the target list with both Ensembl ID and Uniprot ID and Enterez ID (for KEGG and GSEA).
# annotate with Ensembl ID
# Ensembl IDs are annotated using `EnsDb.Mmusculus.v79` package since that is the one that I used for RNA-Seq analysis
target_gene_list <- peaks.mir$target_gene[!is.na(peaks.mir$target_gene)]
annotations_ensembl <- AnnotationDbi::select(EnsDb.Mmusculus.v79,
keys = as.character(target_gene_list),
columns = c("GENEID"),
keytype = "GENENAME")
# Determine the indices for the non-duplicated genes
non_duplicates_idx <- which(duplicated(annotations_ensembl$GENEID) == FALSE)
# Return only the non-duplicated genes using indices
annotations_ensembl <- annotations_ensembl[non_duplicates_idx, ]
# Check number of NAs returned
is.na(annotations_ensembl$GENENAME) %>%
which() %>%
length()
## [1] 0
# annotate the dataset with Ensembl ID
peaks.mir$target_Ensembl_ID <- NA
peaks.mir <- peaks.mir[!is.na(peaks.mir$target_gene)]
for (i in 1:length(peaks.mir)) {
index <- grep(peaks.mir$target_gene[i], annotations_ensembl$GENENAME)
if (length(index)>0) {
peaks.mir$target_Ensembl_ID[i] <- paste(annotations_ensembl$GENEID[index], collapse = " ")
}
}
# annotate the dataset with Uniprot ID
## first I need to generate a list of genes to retrieve Uniprot ID from the website
peak.targets <- unique(peaks.mir$target_gene)
write_csv(as.data.frame(peak.targets), "all_merged_peaks.csv", col_names = FALSE)
uniprot_ID <- read_csv("all_merged_peaks_Uniprot.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## gene_name = col_character(),
## Entry = col_character(),
## `Entry name` = col_character(),
## Status = col_character(),
## `Protein names` = col_character(),
## `Gene names` = col_character(),
## Organism = col_character(),
## Length = col_double()
## )
peaks.mir$target_Uniprot_ID <- NA
for (i in 1:length(peaks.mir)) {
index <- grep(peaks.mir$target_gene[i], uniprot_ID$gene_name)
if (length(index)>0) {
peaks.mir$target_Uniprot_ID[i] <- paste(uniprot_ID$Entry[index], collapse = " ")
}
}
## filter out targets with no ID
no_id <- as.data.frame(setdiff(peak.targets, uniprot_ID$gene_name))
colnames(no_id) <- "gene_name"
## annotate with previously manully annotated no_id_2_id list from HF and HFK individually
hf_id <- read_csv("no.UniprotID.list_2_Uniprot_HF.csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## gene_names = col_character(),
## UniprotID = col_character()
## )
hfk_id <- read_csv("no.UniprotID.list_2_Uniprot_HFK.csv", col_names = FALSE)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## X1 = col_character(),
## X2 = col_character()
## )
no_id <- no_id %>% left_join(hf_id, by = c("gene_name" = "gene_names")) %>% left_join(hfk_id, by = c("gene_name" = "X1"))
for (i in 1:dim(no_id)[1]) {
id <- unique(unlist(no_id[i,c(2,3)]))
id <- id[!is.na(id)]
if (length(id) == 1) {
no_id$UniprotID[i] <- id
}
else if (length(id) > 1) {
id <- paste(id, collapse = " ")
no_id$UniprotID[i] <- id
}
}
no_id <- no_id[,-3]
write_csv(no_id, "merged_no_UniprotID.csv", na = "", col_names = FALSE)
## load the manually annotated ID list
no.id.2.id <- read_csv("merged_no_UniprotID_2_ID.csv", col_names = FALSE)
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## X1 = col_character(),
## X2 = col_character()
## )
for (i in 1:dim(no.id.2.id)[1]) {
index <- grep(no.id.2.id$X1[i], peaks.mir$target_gene)
if (length(index)>0) {
peaks.mir$target_Uniprot_ID[index] <- paste(no.id.2.id$X2[i], collapse = " ")
}
}
# number of genes with UniprotID
length(unique(peaks.mir$target_Uniprot_ID[!is.na(peaks.mir$target_Uniprot_ID)]))
## [1] 2431
# annotate with Entrez ID
# Entrez IDs are annotated using `org.Mm.eg.db` package since this is the most updated
annotations_entrez <- AnnotationDbi::select(org.Mm.eg.db,
keys = as.character(target_gene_list),
columns = c("ENTREZID"),
keytype = "SYMBOL")
## 'select()' returned many:1 mapping between keys and columns
# Determine the indices for the non-duplicated genes
non_duplicates_idx <- which(duplicated(annotations_entrez$ENTREZID) == FALSE)
# Return only the non-duplicated genes using indices
annotations_entrez <- annotations_entrez[non_duplicates_idx, ]
# Check number of NAs returned
is.na(annotations_entrez$ENTREZID) %>%
which() %>%
length()
## [1] 1
# annotate the dataset with Entrez ID
peaks.mir$target_Entrez_ID <- NA
peaks.mir <- peaks.mir[!is.na(peaks.mir$target_gene)]
for (i in 1:length(peaks.mir)) {
index <- grep(peaks.mir$target_gene[i], annotations_entrez$SYMBOL)
if (length(index)>0) {
peaks.mir$target_Entrez_ID[i] <- paste(annotations_entrez$ENTREZID[index], collapse = " ")
}
}
# save the datafile
saveRDS(peaks.mir, "Datafiles/merged-peaks-mirs-200-12232019-withID.rds")
Now we group peaks by miRNA.
targetofmiR <- function(peaks.mir = brain.peaks.mirs,
miRNA = "",
sitetype = "8mer"){
peaks.mir.sub <- as.data.frame(peaks.mir[,c("hfk.hf.log2FC", "hfk.hf.padj",
"seed.8mer", "seed.7m8", "seed.7A1", "seed.6mer")])
peaks.seedmatch <- lapply(c("seed.8mer", "seed.7m8", "seed.7A1", "seed.6mer"),
function(seed){
map <- peaks.mir.sub[grepl(miRNA, peaks.mir.sub[,seed]),]
map <- rownames(map)
map
})
names(peaks.seedmatch) <- c("seed.8mer", "seed.7m8", "seed.7A1", "seed.6mer")
if (sitetype == "8mer"){
maps <- peaks.seedmatch[[1]]
} else if (sitetype == "7mer_above"){
maps <- unique(unlist(peaks.seedmatch[1:3]))
} else if (sitetype == "7mer"){
maps <- unique(unlist(peaks.seedmatch[2:3]))
} else if (sitetype == "6mer"){
maps <- peaks.seedmatch[[4]]
} else {
print("Please input site type as: 8mer, 7mer_above, 7mer or 6mer")
}
return(peaks.mir[maps])
}
mirna.family.DGE <- readRDS("Datafiles/mirna-counts-deseq-by-family-12232019.rds")
mirs <- subset(mirna.family.DGE, baseMean > 200)
mirs <- mirs$miR.family
mirs.peaks <- lapply(mirs,
function(mir){
targetofmiR(miRNA = mir,
peaks.mir = peaks.mir,
sitetype = "7mer_above")
})
names(mirs.peaks) <- mirs
saveRDS(mirs.peaks, "Datafiles/miRNA-merged-peaks-list-12232019-withIDs.rds")
Now we can draw the CDF plots for specific miRNAs. I will do the top 10 that have the most peaks associated.
colnames(rna_DGE)[1] <- "gene"
colnames(rna_DGE)[3] <- "log2FC"
cols <- c(brewer.pal(name = "Set2", n = 8), brewer.pal(name = "Set3", n = 3))
plotCDF.ggplot(rna_DGE,
list(rna_DGE$gene, peaks.mir$target_Ensembl_ID),
c("All genes", "miRNAs over 200"),
col = c("grey15", cols[1]),
linetype = c(1, 1),
title = "RNA_LFC(K-Ras_G12D/WT)"
)
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
dir.create("PDF_figure/CDF_analysis_merged", showWarnings = FALSE)
pdf("PDF_figure/CDF_analysis_merged/RNA_CDF_miRNAover200.pdf",
width = 10,
height = 8)
plotCDF.ggplot(rna_DGE,
list(rna_DGE$gene, peaks.mir$target_Ensembl_ID),
c("All genes", "miRNAs over 200"),
col = c("grey15", cols[1]),
linetype = c(1, 1),
title = "RNA_LFC(K-Ras_G12D/WT)"
)
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
dev.off()
## quartz_off_screen
## 2
len <- sapply(mirs.peaks, function(x) length(x))
mirs.peaks <- mirs.peaks[order(-len)]
mirna <- names(mirs.peaks)
for (i in 1:10) {
plotCDF.ggplot(rna_DGE,
list(rna_DGE$gene, mirs.peaks[[i]]$target_Ensembl_ID),
c("All genes", mirna[i]),
col = c("grey15", cols[i+1]),
linetype = c(1, 1),
title = "RNA_LFC(K-Ras_G12D/WT)"
)
}
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
pdf("PDF_figure/CDF_analysis_merged/RNA_CDF_top10_active_miRNA.pdf",
width = 10,
height = 8)
for (i in 1:10) {
plotCDF.ggplot(rna_DGE,
list(rna_DGE$gene, mirs.peaks[[i]]$target_Ensembl_ID),
c("All genes", mirna[i]),
col = c("grey15", cols[i+1]),
linetype = c(1, 1),
title = "RNA_LFC(K-Ras_G12D/WT)"
)
}
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
dev.off()
## quartz_off_screen
## 2
# define the function
plotBoxplot.ggplot <- function(gene.counts, gene.sets, gene.set.labels) {
require(ggplot2)
df.log2fc <- gene.counts[,c("gene", "log2FC")]
#rownames(df.log2fc) <- df.log2fc$gene
if (length(gene.sets) != length(gene.set.labels)){
return("Length of gene sets doesn't match labels")
}
target.expr <- df.log2fc[df.log2fc$gene %in% gene.sets[[1]],]
for (i in 2:length(gene.sets)){
target.expr <- rbind(target.expr, df.log2fc[df.log2fc$gene %in% gene.sets[[i]],])
}
gene.set.counts <- c()
for (j in 1:length(gene.sets)){
gene.set.counts <- c(gene.set.counts, sum(df.log2fc$gene %in% gene.sets[[j]]))
}
target.expr$Category <- rep(gene.set.labels, gene.set.counts)
target.expr$Category <- factor(target.expr$Category, levels = gene.set.labels)
p <- ggplot(target.expr, aes(x=Category, y=log2FC, fill = Category)) + geom_boxplot() + xlab("") +
ylab("log2FC(KRas-G12D/WT)") + scale_fill_manual(values = cols[1:length(gene.sets)]) + theme(axis.text.x =element_text(angle = 90, hjust = 1))
print(p)
}
#get a list of gene sets from the top 10 miRNA
gene.list <- c()
for (i in 1:10) {
gene.list <- c(gene.list, list(mirs.peaks[[i]]$target_Ensembl_ID))
}
plotBoxplot.ggplot(rna_DGE,
c(list(rna_DGE$gene), gene.list),
c("All genes", mirna[1:10])
) + ylim(-2,2)
## Warning: Removed 127 rows containing non-finite values (stat_boxplot).
pdf("PDF_figure/CDF_analysis_merged/RNA_Boxplot_top10_active_miRNA.pdf",
width = 10,
height = 8)
plotBoxplot.ggplot(rna_DGE,
c(list(rna_DGE$gene), gene.list),
c("All genes", mirna[1:10])
) + ylim(-2,2)
## Warning: Removed 127 rows containing non-finite values (stat_boxplot).
dev.off()
## quartz_off_screen
## 2
Load the proteomic dataset
protein_DGE <- read_csv("~/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/Proteomics data/scraped colon/ceMS_diff.csv")
## Warning: Missing column names filled in: 'X1' [1]
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## X1 = col_double(),
## `Protein Id` = col_character(),
## X2 = col_character(),
## `Gene Symbol` = col_character(),
## p_values = col_double(),
## q_values = col_double(),
## foldChange = col_double(),
## LFC = col_double()
## )
colnames(protein_DGE)[2] <- "gene"
colnames(protein_DGE)[8] <- "log2FC"
cols <- c(brewer.pal(name = "Set2", n = 8), brewer.pal(name = "Set3", n = 3))
plotCDF.ggplot(protein_DGE,
list(protein_DGE$gene, peaks.mir$target_Uniprot_ID),
c("All genes", "miRNAs over 200"),
col = c("grey15", cols[1]),
linetype = c(1, 1),
title = "Protein_LFC(K-Ras_G12D/WT)"
)
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
pdf("PDF_figure/CDF_analysis_merged/Protein_CDF_miRNAover200.pdf",
width = 10,
height = 8)
plotCDF.ggplot(protein_DGE,
list(protein_DGE$gene, peaks.mir$target_Uniprot_ID),
c("All genes", "miRNAs over 200"),
col = c("grey15", cols[1]),
linetype = c(1, 1),
title = "Protein_LFC(K-Ras_G12D/WT)"
)
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
dev.off()
## quartz_off_screen
## 2
for (i in 1:10) {
plotCDF.ggplot(protein_DGE,
list(protein_DGE$gene, mirs.peaks[[i]]$target_Uniprot_ID),
c("All genes", mirna[i]),
col = c("grey15", cols[i+1]),
linetype = c(1, 1),
title = "Protein_LFC(K-Ras_G12D/WT)"
)
}
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
pdf("PDF_figure/CDF_analysis_merged/Protein_CDF_top10_active_miRNA.pdf",
width = 10,
height = 8)
for (i in 1:10) {
plotCDF.ggplot(protein_DGE,
list(protein_DGE$gene, mirs.peaks[[i]]$target_Uniprot_ID),
c("All genes", mirna[i]),
col = c("grey15", cols[i+1]),
linetype = c(1, 1),
title = "Protein_LFC(K-Ras_G12D/WT)"
)
}
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
dev.off()
## quartz_off_screen
## 2
# get a list of gene sets from the top 10 miRNA
gene.list <- c()
for (i in 1:10) {
gene.list <- c(gene.list, list(mirs.peaks[[i]]$target_Uniprot_ID))
}
plotBoxplot.ggplot(protein_DGE,
c(list(protein_DGE$gene), gene.list),
c("All genes", mirna[1:10])
) + ylim(-1,1)
## Warning: Removed 365 rows containing non-finite values (stat_boxplot).
pdf("PDF_figure/CDF_analysis_merged/Protein_Boxplot_top10_active_miRNA.pdf",
width = 10,
height = 8)
plotBoxplot.ggplot(protein_DGE,
c(list(protein_DGE$gene), gene.list),
c("All genes", mirna[1:10])
) + ylim(-1,1)
## Warning: Removed 365 rows containing non-finite values (stat_boxplot).
dev.off()
## quartz_off_screen
## 2
This analysis will use the LFC of CLIP peak heights between HVAK and HVA samples.
clip_DGE <- as.data.frame(cbind(peaks.mir$name, peaks.mir$hfk.hf.log2FC))
colnames(clip_DGE)[1] <- "gene"
colnames(clip_DGE)[2] <- "log2FC"
clip_DGE$log2FC <- as.numeric(as.character(clip_DGE$log2FC))
cols <- c(brewer.pal(name = "Set2", n = 8), brewer.pal(name = "Set3", n = 3))
for (i in 1:10) {
plotCDF.ggplot(clip_DGE,
list(clip_DGE$gene, mirs.peaks[[i]]$name),
c("All peaks", mirna[i]),
col = c("grey15", cols[i]),
linetype = c(1, 1),
title = "HEAP-CLIP_LFC(K-Ras_G12D/WT)",
xlim = c(-8,8)
)
}
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning: Removed 51 rows containing non-finite values (stat_ecdf).
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning: Removed 52 rows containing non-finite values (stat_ecdf).
## Warning: p-value will be approximate in the presence of ties
## Warning: p-value will be approximate in the presence of ties
## Warning: Removed 49 rows containing non-finite values (stat_ecdf).
## Warning: p-value will be approximate in the presence of ties
## Warning: p-value will be approximate in the presence of ties
## Warning: Removed 49 rows containing non-finite values (stat_ecdf).
## Warning: p-value will be approximate in the presence of ties
## Warning: p-value will be approximate in the presence of ties
## Warning: Removed 49 rows containing non-finite values (stat_ecdf).
## Warning: p-value will be approximate in the presence of ties
## Warning: p-value will be approximate in the presence of ties
## Warning: Removed 50 rows containing non-finite values (stat_ecdf).
## Warning: p-value will be approximate in the presence of ties
## Warning: p-value will be approximate in the presence of ties
## Warning: Removed 49 rows containing non-finite values (stat_ecdf).
## Warning: p-value will be approximate in the presence of ties
## Warning: p-value will be approximate in the presence of ties
## Warning: Removed 49 rows containing non-finite values (stat_ecdf).
## Warning: p-value will be approximate in the presence of ties
## Warning: p-value will be approximate in the presence of ties
## Warning: Removed 49 rows containing non-finite values (stat_ecdf).
## Warning: p-value will be approximate in the presence of ties
## Warning: p-value will be approximate in the presence of ties
## Warning: Removed 49 rows containing non-finite values (stat_ecdf).
pdf("PDF_figure/CDF_analysis_merged/CLIP_CDF_top10_active_miRNA.pdf",
width = 10,
height = 8)
for (i in 1:10) {
plotCDF.ggplot(clip_DGE,
list(clip_DGE$gene, mirs.peaks[[i]]$name),
c("All peaks", mirna[i]),
col = c("grey15", cols[i+1]),
linetype = c(1, 1),
title = "HEAP-CLIP_LFC(K-Ras_G12D/WT)",
xlim = c(-8,8)
)
}
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning: Removed 51 rows containing non-finite values (stat_ecdf).
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning: Removed 52 rows containing non-finite values (stat_ecdf).
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning: Removed 49 rows containing non-finite values (stat_ecdf).
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning: Removed 49 rows containing non-finite values (stat_ecdf).
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning: Removed 49 rows containing non-finite values (stat_ecdf).
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning: Removed 50 rows containing non-finite values (stat_ecdf).
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning: Removed 49 rows containing non-finite values (stat_ecdf).
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning: Removed 49 rows containing non-finite values (stat_ecdf).
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning: Removed 49 rows containing non-finite values (stat_ecdf).
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning in ks.test(log2FCs, log2FC.values[[1]]): p-value will be approximate in
## the presence of ties
## Warning: Removed 49 rows containing non-finite values (stat_ecdf).
dev.off()
## quartz_off_screen
## 2
# get a list of gene sets from the top 10 miRNA
gene.list <- c()
for (i in 1:10) {
gene.list <- c(gene.list, list(mirs.peaks[[i]]$name))
}
plotBoxplot.ggplot(clip_DGE,
c(list(clip_DGE$gene), gene.list),
c("All genes", mirna[1:10])
)
## Warning: Removed 55 rows containing non-finite values (stat_boxplot).
pdf("PDF_figure/CDF_analysis_merged/CLIP_Boxplot_top10_active_miRNA.pdf",
width = 10,
height = 8)
plotBoxplot.ggplot(clip_DGE,
c(list(clip_DGE$gene), gene.list),
c("All genes", mirna[1:10])
)
## Warning: Removed 55 rows containing non-finite values (stat_boxplot).
dev.off()
## quartz_off_screen
## 2
# filter for miRNA with mean counts > 200
mirna.family.DGE <- mirna.family.DGE[mirna.family.DGE$baseMean>200,]
mirna.family.DGE$peak.number <- NA
for (i in 1:length(mirna.family.DGE$miR.family)) {
peak.number <- length(mirs.peaks[[i]])
index <- grep(names(mirs.peaks)[i], mirna.family.DGE$miR.family)
mirna.family.DGE$peak.number[index] <- peak.number
}
p1 <- ggplot(mirna.family.DGE, aes(x = log2(baseMean), y = peak.number, label = miR.family, size = peak.number)) +
geom_point(colour = "#EC469A", alpha = 0.6) +
geom_hline(yintercept = 0, linetype = "dashed", size = 0.3) +
geom_vline(xintercept = 0, linetype = "dashed", size = 0.3) +
xlab("Log2(Ago2-bound miRNA abundance)") +
ylab("Number of HEAP-CLIP peaks mapped") +
theme_bw() +
theme(panel.border = element_blank(),
panel.background = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.title.x = element_text(size=12, margin = margin(t = 10)),
axis.title.y = element_text(size=12, margin = margin(r = 10)),
axis.text = element_text(size=10),
axis.line.y = element_line(size = 0.5),
axis.line.x = element_line(size = 0.5),
axis.ticks.x = element_line(size = 0),
axis.ticks.y = element_line(size = 0.5),
plot.title = element_text(hjust = 0.5, size = 12, face = "bold")) +
xlim(5,18)
p2 <- ggplotly(p1)
p2
pdf("PDF_figure/CDF_analysis_merged/Correlation_miRNAabundance_peaknumber.pdf",
width = 6,
height = 4)
p1
## Warning: Removed 1 rows containing missing values (geom_vline).
dev.off()
## quartz_off_screen
## 2
# calculate the correlation between these two factors
cor.test(log10(mirna.family.DGE$baseMean), mirna.family.DGE$peak.number)
##
## Pearson's product-moment correlation
##
## data: log10(mirna.family.DGE$baseMean) and mirna.family.DGE$peak.number
## t = 5.4817, df = 73, p-value = 5.733e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3567663 0.6832282
## sample estimates:
## cor
## 0.5400028
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] plotly_4.9.4 forcats_0.5.1
## [3] stringr_1.4.0 dplyr_1.0.6
## [5] purrr_0.3.4 tidyr_1.1.3
## [7] tibble_3.1.2 tidyverse_1.3.1
## [9] org.Mm.eg.db_3.11.4 EnsDb.Mmusculus.v79_2.99.0
## [11] ensembldb_2.12.1 AnnotationFilter_1.12.0
## [13] GenomicFeatures_1.40.1 AnnotationDbi_1.50.3
## [15] readr_1.4.0 RColorBrewer_1.1-2
## [17] biomaRt_2.44.4 ggplot2_3.3.3
## [19] VennDiagram_1.6.20 futile.logger_1.4.3
## [21] BSgenome_1.56.0 Biostrings_2.56.0
## [23] XVector_0.28.0 rtracklayer_1.48.0
## [25] gdata_2.18.0 data.table_1.14.0
## [27] DESeq2_1.28.1 SummarizedExperiment_1.18.2
## [29] DelayedArray_0.14.1 matrixStats_0.59.0
## [31] Biobase_2.48.0 GenomicRanges_1.40.0
## [33] GenomeInfoDb_1.24.2 IRanges_2.22.2
## [35] S4Vectors_0.26.1 BiocGenerics_0.34.0
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-1 ellipsis_0.3.2 fs_1.5.0
## [4] rstudioapi_0.13 farver_2.1.0 bit64_4.0.5
## [7] lubridate_1.7.10 fansi_0.5.0 xml2_1.3.2
## [10] splines_4.0.3 cachem_1.0.5 geneplotter_1.66.0
## [13] knitr_1.33 jsonlite_1.7.2 Rsamtools_2.4.0
## [16] broom_0.7.7 annotate_1.66.0 dbplyr_2.1.1
## [19] compiler_4.0.3 httr_1.4.2 backports_1.2.1
## [22] assertthat_0.2.1 Matrix_1.3-4 fastmap_1.1.0
## [25] lazyeval_0.2.2 cli_2.5.0 formatR_1.11
## [28] htmltools_0.5.1.1 prettyunits_1.1.1 tools_4.0.3
## [31] gtable_0.3.0 glue_1.4.2 GenomeInfoDbData_1.2.3
## [34] rappdirs_0.3.3 Rcpp_1.0.6 cellranger_1.1.0
## [37] jquerylib_0.1.4 vctrs_0.3.8 crosstalk_1.1.1
## [40] xfun_0.23 rvest_1.0.0 lifecycle_1.0.0
## [43] gtools_3.9.2 XML_3.99-0.6 zlibbioc_1.34.0
## [46] scales_1.1.1 hms_1.1.0 ProtGenerics_1.20.0
## [49] lambda.r_1.2.4 yaml_2.2.1 curl_4.3.1
## [52] memoise_2.0.0 sass_0.4.0 stringi_1.6.2
## [55] RSQLite_2.2.7 highr_0.9 genefilter_1.70.0
## [58] BiocParallel_1.22.0 rlang_0.4.11 pkgconfig_2.0.3
## [61] bitops_1.0-7 evaluate_0.14 lattice_0.20-44
## [64] labeling_0.4.2 htmlwidgets_1.5.3 GenomicAlignments_1.24.0
## [67] bit_4.0.4 tidyselect_1.1.1 magrittr_2.0.1
## [70] R6_2.5.0 generics_0.1.0 DBI_1.1.1
## [73] haven_2.4.1 pillar_1.6.1 withr_2.4.2
## [76] survival_3.2-11 RCurl_1.98-1.3 modelr_0.1.8
## [79] crayon_1.4.1 futile.options_1.0.1 utf8_1.2.1
## [82] BiocFileCache_1.12.1 rmarkdown_2.9 progress_1.2.2
## [85] readxl_1.3.1 locfit_1.5-9.4 blob_1.2.1
## [88] reprex_2.0.0 digest_0.6.27 xtable_1.8-4
## [91] openssl_1.4.4 munsell_0.5.0 viridisLite_0.4.0
## [94] bslib_0.2.5.1 askpass_1.1